{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# EXEC_Web-Crawler_for_Proxy_IP_Jupyter_Nb_GF_2024-07-14.ipynb\n",
    "# Created by GF 2024-07-14 21:57"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import time\n",
    "# ..................................................\n",
    "import pandas as pd\n",
    "# ..................................................\n",
    "import requests\n",
    "# ..................................................\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "from SCRIPT.PYTHON3 import GF_Crawling_Proxy_IP_from_kuaidaili_com\n",
    "# ..................................................\n",
    "# Short alias: the cells below call the crawler helpers as SCRIPT.<function>.\n",
    "# The from-import above binds only the module name, not the top-level SCRIPT package,\n",
    "# so this assignment does not overwrite an existing binding.\n",
    "SCRIPT = GF_Crawling_Proxy_IP_from_kuaidaili_com"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Message] Web Crawling: Processing https://www.kuaidaili.com/free/inha/50/\r"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>IP</th>\n",
       "      <th>最后验证时间</th>\n",
       "      <th>PORT</th>\n",
       "      <th>响应速度</th>\n",
       "      <th>位置</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8.212.107.200</td>\n",
       "      <td>2024-07-15 23:30:04</td>\n",
       "      <td>80</td>\n",
       "      <td>174</td>\n",
       "      <td>香港</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>111.206.0.99</td>\n",
       "      <td>2024-07-15 22:30:02</td>\n",
       "      <td>8181</td>\n",
       "      <td>273</td>\n",
       "      <td>北京市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>47.93.223.11</td>\n",
       "      <td>2024-07-15 21:30:02</td>\n",
       "      <td>7890</td>\n",
       "      <td>170</td>\n",
       "      <td>北京市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60.12.168.114</td>\n",
       "      <td>2024-07-15 20:30:03</td>\n",
       "      <td>9002</td>\n",
       "      <td>288</td>\n",
       "      <td>台州市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>212.107.28.120</td>\n",
       "      <td>2024-07-15 19:30:02</td>\n",
       "      <td>80</td>\n",
       "      <td>197</td>\n",
       "      <td>香港</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>39.129.73.6</td>\n",
       "      <td>2024-06-21 04:30:02</td>\n",
       "      <td>443</td>\n",
       "      <td>251</td>\n",
       "      <td>昭通市</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>47.243.92.199</td>\n",
       "      <td>2024-06-21 03:30:02</td>\n",
       "      <td>3128</td>\n",
       "      <td>148</td>\n",
       "      <td>香港</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>47.243.92.199</td>\n",
       "      <td>2024-06-21 02:30:01</td>\n",
       "      <td>3128</td>\n",
       "      <td>186</td>\n",
       "      <td>香港</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>154.203.132.55</td>\n",
       "      <td>2024-06-21 01:30:01</td>\n",
       "      <td>8080</td>\n",
       "      <td>240</td>\n",
       "      <td>美国 加利福尼亚</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>111.160.204.146</td>\n",
       "      <td>2024-06-21 00:30:02</td>\n",
       "      <td>9091</td>\n",
       "      <td>262</td>\n",
       "      <td>天津市</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>600 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 IP               最后验证时间  PORT  响应速度        位置\n",
       "0     8.212.107.200  2024-07-15 23:30:04    80   174        香港\n",
       "1      111.206.0.99  2024-07-15 22:30:02  8181   273       北京市\n",
       "2      47.93.223.11  2024-07-15 21:30:02  7890   170       北京市\n",
       "3     60.12.168.114  2024-07-15 20:30:03  9002   288       台州市\n",
       "4    212.107.28.120  2024-07-15 19:30:02    80   197        香港\n",
       "..              ...                  ...   ...   ...       ...\n",
       "7       39.129.73.6  2024-06-21 04:30:02   443   251       昭通市\n",
       "8     47.243.92.199  2024-06-21 03:30:02  3128   148        香港\n",
       "9     47.243.92.199  2024-06-21 02:30:01  3128   186        香港\n",
       "10   154.203.132.55  2024-06-21 01:30:01  8080   240  美国 加利福尼亚\n",
       "11  111.160.204.146  2024-06-21 00:30:02  9091   262       天津市\n",
       "\n",
       "[600 rows x 5 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Crawl the free-proxy listing pages one by one and merge them into a single DataFrame.\n",
    "Page_Count = 50 # Number of listing pages to crawl (pages 1 .. Page_Count).\n",
    "\n",
    "PDF_List:list = []\n",
    "\n",
    "for Page_Num in range(1, Page_Count + 1): # -> range() includes the start and excludes the end.\n",
    "    \n",
    "    URL = \"https://www.kuaidaili.com/free/inha/{}/\".format(Page_Num)\n",
    "    \n",
    "    # ----------------------------------------------\n",
    "    Response = SCRIPT.Send_Request_and_Get_Response(URL)\n",
    "    \n",
    "    # ----------------------------------------------\n",
    "    # \"fpsList\" is presumably the name of the JavaScript list embedded in the page -- confirm in SCRIPT.\n",
    "    Json_Array = SCRIPT.Parse_The_Response_Data_and_Parse_JavaScript_List_as_Tuple_by_BS4(Response, \"fpsList\")\n",
    "    \n",
    "    # ----------------------------------------------\n",
    "    PDF = SCRIPT.Parse_Json_Array_as_Pandas_DataFrame(Json_Array)\n",
    "    # ..............................................\n",
    "    PDF_List.append(PDF)\n",
    "\n",
    "# --------------------------------------------------\n",
    "# ignore_index=True renumbers the rows 0 .. N-1; without it the merged frame keeps\n",
    "# each page's own row labels, so the same labels repeat once per page.\n",
    "Proxy_IP_PDF = pd.concat(PDF_List, ignore_index=True)\n",
    "\n",
    "# --------------------------------------------------\n",
    "Proxy_IP_PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[Caution] An Error Occurred While Processing: http://47.93.223.11:7890 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://1.162.12.227:80 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://153.101.67.170:9002 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://183.234.215.11:8443 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://119.96.100.63:30000 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://122.116.150.2:9000 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://103.73.66.36:8085 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://47.100.254.82:80 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://119.96.100.63:30000 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://58.20.248.139:9002 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n",
      "[Caution] An Error Occurred While Processing: http://123.205.24.244:80 | The Error is: HTTPSConnectionPool(host='www.baidu.com', port=443): Read timed out. (read timeout=0.1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>IP</th>\n",
       "      <th>最后验证时间</th>\n",
       "      <th>PORT</th>\n",
       "      <th>响应速度</th>\n",
       "      <th>位置</th>\n",
       "      <th>协议</th>\n",
       "      <th>可用性(0.1s)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8.212.107.200</td>\n",
       "      <td>2024-07-15 23:30:04</td>\n",
       "      <td>80</td>\n",
       "      <td>174</td>\n",
       "      <td>香港</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>111.206.0.99</td>\n",
       "      <td>2024-07-15 22:30:02</td>\n",
       "      <td>8181</td>\n",
       "      <td>273</td>\n",
       "      <td>北京市</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>47.93.223.11</td>\n",
       "      <td>2024-07-15 21:30:02</td>\n",
       "      <td>7890</td>\n",
       "      <td>170</td>\n",
       "      <td>北京市</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60.12.168.114</td>\n",
       "      <td>2024-07-15 20:30:03</td>\n",
       "      <td>9002</td>\n",
       "      <td>288</td>\n",
       "      <td>台州市</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>212.107.28.120</td>\n",
       "      <td>2024-07-15 19:30:02</td>\n",
       "      <td>80</td>\n",
       "      <td>197</td>\n",
       "      <td>香港</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>39.129.73.6</td>\n",
       "      <td>2024-06-21 04:30:02</td>\n",
       "      <td>443</td>\n",
       "      <td>251</td>\n",
       "      <td>昭通市</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>47.243.92.199</td>\n",
       "      <td>2024-06-21 03:30:02</td>\n",
       "      <td>3128</td>\n",
       "      <td>148</td>\n",
       "      <td>香港</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>47.243.92.199</td>\n",
       "      <td>2024-06-21 02:30:01</td>\n",
       "      <td>3128</td>\n",
       "      <td>186</td>\n",
       "      <td>香港</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>154.203.132.55</td>\n",
       "      <td>2024-06-21 01:30:01</td>\n",
       "      <td>8080</td>\n",
       "      <td>240</td>\n",
       "      <td>美国 加利福尼亚</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>111.160.204.146</td>\n",
       "      <td>2024-06-21 00:30:02</td>\n",
       "      <td>9091</td>\n",
       "      <td>262</td>\n",
       "      <td>天津市</td>\n",
       "      <td>HTTP</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>600 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 IP               最后验证时间  PORT  响应速度        位置    协议  \\\n",
       "0     8.212.107.200  2024-07-15 23:30:04    80   174        香港  HTTP   \n",
       "1      111.206.0.99  2024-07-15 22:30:02  8181   273       北京市  HTTP   \n",
       "2      47.93.223.11  2024-07-15 21:30:02  7890   170       北京市  HTTP   \n",
       "3     60.12.168.114  2024-07-15 20:30:03  9002   288       台州市  HTTP   \n",
       "4    212.107.28.120  2024-07-15 19:30:02    80   197        香港  HTTP   \n",
       "..              ...                  ...   ...   ...       ...   ...   \n",
       "7       39.129.73.6  2024-06-21 04:30:02   443   251       昭通市  HTTP   \n",
       "8     47.243.92.199  2024-06-21 03:30:02  3128   148        香港  HTTP   \n",
       "9     47.243.92.199  2024-06-21 02:30:01  3128   186        香港  HTTP   \n",
       "10   154.203.132.55  2024-06-21 01:30:01  8080   240  美国 加利福尼亚  HTTP   \n",
       "11  111.160.204.146  2024-06-21 00:30:02  9091   262       天津市  HTTP   \n",
       "\n",
       "    可用性(0.1s)  \n",
       "0           1  \n",
       "1           1  \n",
       "2           0  \n",
       "3           1  \n",
       "4           1  \n",
       "..        ...  \n",
       "7           1  \n",
       "8           1  \n",
       "9           1  \n",
       "10          1  \n",
       "11          1  \n",
       "\n",
       "[600 rows x 7 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Column \"协议\" (protocol): every row gets the same constant value.\n",
    "Proxy_IP_PDF[\"协议\"] = \"HTTP\"\n",
    "\n",
    "# --------------------------------------------------\n",
    "# Column \"可用性(0.1s)\" (availability): call the checker once per row and store\n",
    "# whatever it returns for that row.\n",
    "Proxy_IP_PDF[\"可用性(0.1s)\"] = Proxy_IP_PDF.apply(\n",
    "    lambda Row: SCRIPT.Check_Proxy_IP_by_Timeout(\n",
    "        Protocol=Row[\"协议\"],\n",
    "        Proxy_IP=Row[\"IP\"],\n",
    "        Port=Row[\"PORT\"],\n",
    "    ),\n",
    "    axis=1,\n",
    ")\n",
    "\n",
    "# --------------------------------------------------\n",
    "Proxy_IP_PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "CSV_Path = \"F:\\\\DATASET\\\\Proxy_IP_kuaidaili_com_Write-Only.csv\"\n",
    "\n",
    "# The file is opened in append mode, so emit the header row only when the file is\n",
    "# missing or empty; otherwise every run would add another header line among the data rows.\n",
    "Write_Header = (not os.path.exists(CSV_Path)) or (os.path.getsize(CSV_Path) == 0)\n",
    "\n",
    "# --------------------------------------------------\n",
    "Proxy_IP_PDF.to_csv(CSV_Path, mode='a', index=False, header=Write_Header)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
